from bs4 import BeautifulSoup
import re

# Precompiled patterns, applied to the raw markup of one movie entry.

# href of the anchor whose class attribute is empty.
findLink = re.compile(r'<a class="" href="(.*?)">')
# src of the <img> tag rendered with width="100".
findPic = re.compile(r'<img.*src="(.*?)" width="100"/>')
# Text of every <span class="title"> element.
findTitle = re.compile(r'<span class="title">(.*)</span>')
# Body of <p class="">; DOTALL lets "." cross the line breaks inside it.
findSurvey = re.compile(r'<p class="">(.*?)</p>', flags=re.DOTALL)
# Text of the <span class="rating_num" ...> element.
findRateNum = re.compile(r'<span class="rating_num".*>(.*?)</span>')
# Text that precedes the "人评价" suffix inside a bare <span>.
findCommentNum = re.compile(r'<span>(.*?)人评价</span>')
# Text of the <span class="inq"> element.
findInq = re.compile(r'<span class="inq">(.*?)</span>')


# Load the saved page. The encoding is passed explicitly so the text is
# decoded as UTF-8 regardless of the platform default.
with open("index.html", "r", encoding="utf-8") as f:
    html = f.read()

bs = BeautifulSoup(html, "html.parser")

# Every <div class="item"> is converted back to markup and searched with the
# precompiled module-level patterns. Each result is a list of matches.
for item in bs.find_all("div", class_="item"):
    movie = str(item)

    # Link to the movie
    link = findLink.findall(movie)

    # Link to the picture
    pic = findPic.findall(movie)

    # Movie titles
    title = findTitle.findall(movie)
    if len(title) == 2:
        # Remove the "/" and the non-breaking spaces from the second title.
        title[1] = title[1].replace("/", "").replace("\xa0", "")
    else:
        # Pad with a blank entry. Lists have no .add(); the original call
        # raised AttributeError whenever this branch ran.
        title.append(" ")

    # Movie overview
    survey = findSurvey.findall(movie)
    if survey:  # guard: indexing an empty match list would raise IndexError
        # Drop <br/> tags, then collapse each whitespace run to one space
        # (split/join also trims both ends).
        survey[0] = " ".join(survey[0].replace("<br/>", "").split())

    # Movie rating
    rateNum = findRateNum.findall(movie)

    # Number of people who rated the movie
    commentNum = findCommentNum.findall(movie)

    # One-line description of the movie
    inq = findInq.findall(movie)
    if inq:  # guard: an item without this span yields an empty list
        # Remove the full-width full stop from the description.
        inq[0] = inq[0].replace("。", "")

    # Uncomment to inspect the extracted fields:
    # print(link, pic, title, survey, rateNum, commentNum, inq)

    # Only the first item is processed.
    break